set.seed(1)
project_2_data =
read_csv("./data/project_2_data.csv", na = c("NA", "", ".")) |>
janitor::clean_names() |>
mutate(status = ifelse(status == "Dead", 1, 0))
## Rows: 4024 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (11): Race, Marital Status, T Stage, N Stage, 6th Stage, differentiate, ...
## dbl (5): Age, Tumor Size, Regional Node Examined, Reginol Node Positive, Su...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dead <- project_2_data |>
filter(status == 1)
alive <- project_2_data |>
filter(status == 0) |>
sample_n(nrow(dead))
project_2_numdata<-
bind_rows(dead, alive) |>
mutate(
t_stage = case_when(
t_stage == "T1" ~ 1,
t_stage == "T2" ~ 2,
t_stage == "T3" ~ 3,
t_stage == "T4" ~ 4,
TRUE ~ NA_real_),
n_stage = case_when(
n_stage == "N1" ~ 1,
n_stage == "N2" ~ 2,
n_stage == "N3" ~ 3,
TRUE ~ NA_real_),
x6th_stage_num = case_when(
x6th_stage == "IIA" ~ 1,
x6th_stage == "IIB" ~ 2,
x6th_stage == "IIIA" ~ 3,
x6th_stage == "IIIB" ~ 4,
x6th_stage == "IIIC" ~ 5,
TRUE ~ NA_real_),
differentiate = case_when(
differentiate == "Well differentiated" ~ 1,
differentiate == "Moderately differentiated" ~ 2,
differentiate == "Poorly differentiated" ~ 3,
differentiate == "Undifferentiated" ~ 4,
TRUE ~ NA_real_),
grade = case_when(
grade == "anaplastic; Grade IV" ~ 4,
grade == "3" ~ 3,
grade == "2" ~ 2,
grade == "1" ~ 1,
TRUE ~ NA_real_),
a_stage_regional = ifelse(a_stage == "Regional", 1, 0),
estrogen_status = ifelse(estrogen_status == "Positive", 1, 0),
progesterone_status = ifelse(progesterone_status == "Positive", 1, 0)
) |>
select(-a_stage)
surv_object <- Surv(time = project_2_numdata$survival_months, event = project_2_numdata$status)
variables <- names(project_2_numdata)[!names(project_2_numdata) %in% c("survival_months", "status")]
for (var in variables) {
formula <- as.formula(paste("Surv(survival_months, status) ~", var))
model <- coxph(formula, data = project_2_numdata)
print(var)
print(summary(model))
}
## [1] "age"
## Call:
## coxph(formula = formula, data = project_2_numdata)
##
## n= 1232, number of events= 616
##
## coef exp(coef) se(coef) z Pr(>|z|)
## age 0.013313 1.013402 0.004454 2.989 0.0028 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## age 1.013 0.9868 1.005 1.022
##
## Concordance= 0.536 (se = 0.012 )
## Likelihood ratio test= 9.05 on 1 df, p=0.003
## Wald test = 8.93 on 1 df, p=0.003
## Score (logrank) test = 8.95 on 1 df, p=0.003
##
## [1] "race"
## Call:
## coxph(formula = formula, data = project_2_numdata)
##
## n= 1232, number of events= 616
##
## coef exp(coef) se(coef) z Pr(>|z|)
## raceOther -0.5965 0.5508 0.2099 -2.842 0.00448 **
## raceWhite -0.3263 0.7216 0.1252 -2.607 0.00914 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## raceOther 0.5508 1.816 0.3650 0.8310
## raceWhite 0.7216 1.386 0.5646 0.9222
##
## Concordance= 0.527 (se = 0.008 )
## Likelihood ratio test= 9.46 on 2 df, p=0.009
## Wald test = 9.81 on 2 df, p=0.007
## Score (logrank) test = 9.92 on 2 df, p=0.007
##
## [1] "marital_status"
## Call:
## coxph(formula = formula, data = project_2_numdata)
##
## n= 1232, number of events= 616
##
## coef exp(coef) se(coef) z Pr(>|z|)
## marital_statusMarried -0.23876 0.78761 0.11794 -2.024 0.0429 *
## marital_statusSeparated 0.53873 1.71383 0.27908 1.930 0.0536 .
## marital_statusSingle -0.10841 0.89726 0.14404 -0.753 0.4517
## marital_statusWidowed 0.04877 1.04998 0.17757 0.275 0.7836
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## marital_statusMarried 0.7876 1.2697 0.6251 0.9924
## marital_statusSeparated 1.7138 0.5835 0.9918 2.9616
## marital_statusSingle 0.8973 1.1145 0.6766 1.1899
## marital_statusWidowed 1.0500 0.9524 0.7414 1.4871
##
## Concordance= 0.537 (se = 0.011 )
## Likelihood ratio test= 12.55 on 4 df, p=0.01
## Wald test = 14 on 4 df, p=0.007
## Score (logrank) test = 14.36 on 4 df, p=0.006
##
## [1] "t_stage"
## Call:
## coxph(formula = formula, data = project_2_numdata)
##
## n= 1232, number of events= 616
##
## coef exp(coef) se(coef) z Pr(>|z|)
## t_stage 0.38440 1.46874 0.04713 8.156 3.48e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## t_stage 1.469 0.6809 1.339 1.611
##
## Concordance= 0.582 (se = 0.011 )
## Likelihood ratio test= 62.94 on 1 df, p=2e-15
## Wald test = 66.51 on 1 df, p=3e-16
## Score (logrank) test = 67.36 on 1 df, p=2e-16
##
## [1] "n_stage"
## Call:
## coxph(formula = formula, data = project_2_numdata)
##
## n= 1232, number of events= 616
##
## coef exp(coef) se(coef) z Pr(>|z|)
## n_stage 0.61870 1.85652 0.04835 12.8 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## n_stage 1.857 0.5386 1.689 2.041
##
## Concordance= 0.623 (se = 0.011 )
## Likelihood ratio test= 152 on 1 df, p=<2e-16
## Wald test = 163.7 on 1 df, p=<2e-16
## Score (logrank) test = 175.6 on 1 df, p=<2e-16
##
## [1] "x6th_stage"
## Call:
## coxph(formula = formula, data = project_2_numdata)
##
## n= 1232, number of events= 616
##
## coef exp(coef) se(coef) z Pr(>|z|)
## x6th_stageIIB 0.4390 1.5512 0.1336 3.287 0.00101 **
## x6th_stageIIIA 0.7669 2.1530 0.1260 6.088 1.15e-09 ***
## x6th_stageIIIB 1.4792 4.3893 0.2464 6.003 1.94e-09 ***
## x6th_stageIIIC 1.5199 4.5718 0.1274 11.934 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## x6th_stageIIB 1.551 0.6447 1.194 2.015
## x6th_stageIIIA 2.153 0.4645 1.682 2.756
## x6th_stageIIIB 4.389 0.2278 2.708 7.115
## x6th_stageIIIC 4.572 0.2187 3.562 5.868
##
## Concordance= 0.637 (se = 0.012 )
## Likelihood ratio test= 171.5 on 4 df, p=<2e-16
## Wald test = 177.7 on 4 df, p=<2e-16
## Score (logrank) test = 198.8 on 4 df, p=<2e-16
##
## [1] "differentiate"
## Call:
## coxph(formula = formula, data = project_2_numdata)
##
## n= 1232, number of events= 616
##
## coef exp(coef) se(coef) z Pr(>|z|)
## differentiate 0.44124 1.55463 0.06433 6.859 6.94e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## differentiate 1.555 0.6432 1.37 1.764
##
## Concordance= 0.574 (se = 0.011 )
## Likelihood ratio test= 48.05 on 1 df, p=4e-12
## Wald test = 47.04 on 1 df, p=7e-12
## Score (logrank) test = 47.37 on 1 df, p=6e-12
##
## [1] "grade"
## Call:
## coxph(formula = formula, data = project_2_numdata)
##
## n= 1232, number of events= 616
##
## coef exp(coef) se(coef) z Pr(>|z|)
## grade 0.44124 1.55463 0.06433 6.859 6.94e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## grade 1.555 0.6432 1.37 1.764
##
## Concordance= 0.574 (se = 0.011 )
## Likelihood ratio test= 48.05 on 1 df, p=4e-12
## Wald test = 47.04 on 1 df, p=7e-12
## Score (logrank) test = 47.37 on 1 df, p=6e-12
##
## [1] "tumor_size"
## Call:
## coxph(formula = formula, data = project_2_numdata)
##
## n= 1232, number of events= 616
##
## coef exp(coef) se(coef) z Pr(>|z|)
## tumor_size 0.011933 1.012005 0.001574 7.58 3.46e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## tumor_size 1.012 0.9881 1.009 1.015
##
## Concordance= 0.592 (se = 0.012 )
## Likelihood ratio test= 50.4 on 1 df, p=1e-12
## Wald test = 57.45 on 1 df, p=3e-14
## Score (logrank) test = 58.25 on 1 df, p=2e-14
##
## [1] "estrogen_status"
## Call:
## coxph(formula = formula, data = project_2_numdata)
##
## n= 1232, number of events= 616
##
## coef exp(coef) se(coef) z Pr(>|z|)
## estrogen_status -0.9108 0.4022 0.1064 -8.562 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## estrogen_status 0.4022 2.486 0.3265 0.4954
##
## Concordance= 0.565 (se = 0.008 )
## Likelihood ratio test= 60.02 on 1 df, p=9e-15
## Wald test = 73.31 on 1 df, p=<2e-16
## Score (logrank) test = 78.48 on 1 df, p=<2e-16
##
## [1] "progesterone_status"
## Call:
## coxph(formula = formula, data = project_2_numdata)
##
## n= 1232, number of events= 616
##
## coef exp(coef) se(coef) z Pr(>|z|)
## progesterone_status -0.71314 0.49010 0.08583 -8.308 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## progesterone_status 0.4901 2.04 0.4142 0.5799
##
## Concordance= 0.588 (se = 0.01 )
## Likelihood ratio test= 62.82 on 1 df, p=2e-15
## Wald test = 69.03 on 1 df, p=<2e-16
## Score (logrank) test = 71.97 on 1 df, p=<2e-16
##
## [1] "regional_node_examined"
## Call:
## coxph(formula = formula, data = project_2_numdata)
##
## n= 1232, number of events= 616
##
## coef exp(coef) se(coef) z Pr(>|z|)
## regional_node_examined 0.010506 1.010561 0.004982 2.109 0.035 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## regional_node_examined 1.011 0.9895 1.001 1.02
##
## Concordance= 0.521 (se = 0.013 )
## Likelihood ratio test= 4.35 on 1 df, p=0.04
## Wald test = 4.45 on 1 df, p=0.03
## Score (logrank) test = 4.44 on 1 df, p=0.04
##
## [1] "reginol_node_positive"
## Call:
## coxph(formula = formula, data = project_2_numdata)
##
## n= 1232, number of events= 616
##
## coef exp(coef) se(coef) z Pr(>|z|)
## reginol_node_positive 0.054926 1.056462 0.004627 11.87 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## reginol_node_positive 1.056 0.9466 1.047 1.066
##
## Concordance= 0.627 (se = 0.012 )
## Likelihood ratio test= 106.3 on 1 df, p=<2e-16
## Wald test = 140.9 on 1 df, p=<2e-16
## Score (logrank) test = 148 on 1 df, p=<2e-16
##
## [1] "x6th_stage_num"
## Call:
## coxph(formula = formula, data = project_2_numdata)
##
## n= 1232, number of events= 616
##
## coef exp(coef) se(coef) z Pr(>|z|)
## x6th_stage_num 0.37716 1.45813 0.02834 13.31 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## x6th_stage_num 1.458 0.6858 1.379 1.541
##
## Concordance= 0.637 (se = 0.012 )
## Likelihood ratio test= 169.3 on 1 df, p=<2e-16
## Wald test = 177.1 on 1 df, p=<2e-16
## Score (logrank) test = 187 on 1 df, p=<2e-16
##
## [1] "a_stage_regional"
## Call:
## coxph(formula = formula, data = project_2_numdata)
##
## n= 1232, number of events= 616
##
## coef exp(coef) se(coef) z Pr(>|z|)
## a_stage_regional -1.1046 0.3314 0.1747 -6.324 2.55e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## a_stage_regional 0.3314 3.018 0.2353 0.4666
##
## Concordance= 0.522 (se = 0.005 )
## Likelihood ratio test= 29.49 on 1 df, p=6e-08
## Wald test = 39.99 on 1 df, p=3e-10
## Score (logrank) test = 44.22 on 1 df, p=3e-11
Based on the univariate Cox model results, the final model should include statistically significant and clinically relevant variables. These include age (p = 0.0028, HR = 1.013), race (p < 0.01, with disparities for Other and White), marital status (significant for Married, p = 0.0429, HR = 0.788), t_stage (p < 2e-16, HR = 1.469), n_stage (p < 2e-16, HR = 1.857), tumor size (p < 2e-14, HR = 1.012), differentiate (p < 7e-12, HR = 1.555), 6th_stage (significant for all categories, HR = 4.572 for IIIC), estrogen status (p < 2e-16, HR = 0.402), progesterone status (p < 2e-16, HR = 0.490), regional node positive (p < 2e-16, HR = 1.056), and a_stage (regional) (p < 3e-10, HR = 0.331). Variables like Single and Widowed marital status (p > 0.05) should be excluded. Further checks for multicollinearity and clinical relevance are necessary for final variable selection.
cor_matrix <- cor(project_2_numdata[, sapply(project_2_numdata, is.numeric)])
print(cor_matrix)
## age t_stage n_stage differentiate
## age 1.00000000 -0.05011340 -0.01706295 -0.12321448
## t_stage -0.05011340 1.00000000 0.30612098 0.15076838
## n_stage -0.01706295 0.30612098 1.00000000 0.17991601
## differentiate -0.12321448 0.15076838 0.17991601 1.00000000
## grade -0.12321448 0.15076838 0.17991601 1.00000000
## tumor_size -0.07953275 0.78499755 0.30277440 0.14403058
## estrogen_status 0.12111658 -0.10672267 -0.12700846 -0.26247316
## progesterone_status 0.01233580 -0.07197461 -0.11108594 -0.21201525
## regional_node_examined -0.03141638 0.08844292 0.36808270 0.07651544
## reginol_node_positive 0.01628318 0.24426697 0.82874594 0.15163577
## survival_months -0.08599693 -0.15971602 -0.24737291 -0.15052637
## status 0.06976636 0.22525091 0.34556514 0.19151006
## x6th_stage_num -0.01872612 0.57138048 0.90805991 0.19138867
## a_stage_regional 0.04777171 -0.23467556 -0.25575034 -0.04874199
## grade tumor_size estrogen_status
## age -0.12321448 -0.07953275 0.1211166
## t_stage 0.15076838 0.78499755 -0.1067227
## n_stage 0.17991601 0.30277440 -0.1270085
## differentiate 1.00000000 0.14403058 -0.2624732
## grade 1.00000000 0.14403058 -0.2624732
## tumor_size 0.14403058 1.00000000 -0.1230207
## estrogen_status -0.26247316 -0.12302067 1.0000000
## progesterone_status -0.21201525 -0.08438705 0.5850460
## regional_node_examined 0.07651544 0.11473470 -0.0414043
## reginol_node_positive 0.15163577 0.24878553 -0.1008736
## survival_months -0.15052637 -0.16151178 0.2272740
## status 0.19151006 0.19535827 -0.1757951
## x6th_stage_num 0.19138867 0.48127728 -0.1392492
## a_stage_regional -0.04874199 -0.12796649 0.1279901
## progesterone_status regional_node_examined
## age 0.01233580 -0.03141638
## t_stage -0.07197461 0.08844292
## n_stage -0.11108594 0.36808270
## differentiate -0.21201525 0.07651544
## grade -0.21201525 0.07651544
## tumor_size -0.08438705 0.11473470
## estrogen_status 0.58504596 -0.04140430
## progesterone_status 1.00000000 -0.01390137
## regional_node_examined -0.01390137 1.00000000
## reginol_node_positive -0.07105526 0.49395643
## survival_months 0.21200308 -0.03087185
## status -0.20001424 0.06199862
## x6th_stage_num -0.11070618 0.35066402
## a_stage_regional 0.07311096 -0.04237809
## reginol_node_positive survival_months status
## age 0.01628318 -0.08599693 0.06976636
## t_stage 0.24426697 -0.15971602 0.22525091
## n_stage 0.82874594 -0.24737291 0.34556514
## differentiate 0.15163577 -0.15052637 0.19151006
## grade 0.15163577 -0.15052637 0.19151006
## tumor_size 0.24878553 -0.16151178 0.19535827
## estrogen_status -0.10087358 0.22727403 -0.17579507
## progesterone_status -0.07105526 0.21200308 -0.20001424
## regional_node_examined 0.49395643 -0.03087185 0.06199862
## reginol_node_positive 1.00000000 -0.21263747 0.31134369
## survival_months -0.21263747 1.00000000 -0.58302039
## status 0.31134369 -0.58302039 1.00000000
## x6th_stage_num 0.78335778 -0.25969095 0.36006577
## a_stage_regional -0.19505294 0.13738966 -0.13123516
## x6th_stage_num a_stage_regional
## age -0.01872612 0.04777171
## t_stage 0.57138048 -0.23467556
## n_stage 0.90805991 -0.25575034
## differentiate 0.19138867 -0.04874199
## grade 0.19138867 -0.04874199
## tumor_size 0.48127728 -0.12796649
## estrogen_status -0.13924922 0.12799010
## progesterone_status -0.11070618 0.07311096
## regional_node_examined 0.35066402 -0.04237809
## reginol_node_positive 0.78335778 -0.19505294
## survival_months -0.25969095 0.13738966
## status 0.36006577 -0.13123516
## x6th_stage_num 1.00000000 -0.29133616
## a_stage_regional -0.29133616 1.00000000
cortable<-project_2_numdata|>
select(-race, -marital_status,-x6th_stage)
cortable$regional_node_examined
## [1] 9 9 12 15 16 23 24 8 5 21 4 3 33 7 11 3 16 8 28 20 14 23 20 13
## [25] 15 14 20 17 11 23 3 34 1 17 17 4 12 27 14 20 28 29 18 3 4 13 23 18
## [49] 7 41 11 10 27 19 5 24 2 12 16 15 40 9 10 19 17 3 19 16 7 25 1 9
## [73] 10 15 9 11 29 21 17 17 7 15 22 47 22 14 9 9 13 14 10 19 15 6 2 54
## [97] 5 31 28 21 19 8 13 13 9 18 12 15 27 24 5 7 22 17 16 12 9 23 10 6
## [121] 19 5 15 10 30 18 15 27 14 3 36 16 5 9 3 32 31 21 18 10 3 20 10 25
## [145] 9 14 9 29 32 19 27 1 11 13 14 21 11 15 16 22 2 14 17 5 10 23 10 3
## [169] 17 18 18 23 9 9 8 23 19 12 9 26 12 15 23 9 14 22 16 30 4 2 10 13
## [193] 1 10 4 15 7 5 28 12 18 28 18 12 6 18 26 3 21 36 26 18 18 28 16 30
## [217] 2 1 21 14 21 4 10 16 19 15 29 15 34 23 27 17 13 12 3 13 14 36 11 12
## [241] 9 9 5 13 11 20 10 14 16 47 15 16 29 16 9 7 11 6 16 18 22 18 4 25
## [265] 29 1 13 13 9 21 25 10 2 8 21 2 26 17 4 9 13 16 10 13 1 6 9 9
## [289] 18 22 8 15 8 4 4 18 12 17 15 19 16 27 15 10 14 13 12 11 8 14 21 13
## [313] 17 4 19 12 17 7 25 37 8 24 13 22 15 3 13 3 2 19 12 11 11 2 20 12
## [337] 19 11 15 12 21 10 10 17 3 15 17 15 17 11 8 18 21 18 10 29 9 18 13 20
## [361] 35 23 12 7 32 18 18 5 28 10 10 18 11 9 9 1 30 3 7 9 19 57 5 22
## [385] 12 15 12 3 8 15 25 10 4 26 25 16 23 13 17 8 24 9 10 9 36 19 12 8
## [409] 21 8 10 30 17 11 12 20 22 14 4 11 1 4 18 20 12 19 15 21 16 18 11 16
## [433] 17 5 16 23 7 5 19 9 22 22 8 10 19 11 11 7 12 8 13 12 24 12 16 16
## [457] 20 13 3 30 22 35 11 10 7 30 29 14 12 17 2 24 15 10 9 12 4 19 23 28
## [481] 17 14 27 12 16 11 19 22 16 8 9 10 20 15 4 24 19 18 12 4 9 17 22 2
## [505] 11 6 2 16 21 20 12 17 21 7 15 13 6 4 9 30 2 29 13 3 8 5 14 6
## [529] 15 1 14 11 21 34 11 18 13 7 11 11 4 16 19 20 12 13 30 19 17 12 12 15
## [553] 12 9 12 6 8 2 12 28 14 24 24 4 26 7 18 8 18 13 22 9 28 21 10 13
## [577] 6 16 12 2 15 16 17 17 23 25 47 21 19 18 18 3 31 11 2 16 14 16 23 13
## [601] 14 7 10 7 16 25 28 10 16 22 23 21 21 19 6 2 8 6 12 2 5 14 15 8
## [625] 3 23 19 19 15 7 7 14 10 10 14 31 17 25 7 12 15 12 4 17 19 24 14 24
## [649] 6 25 6 19 34 16 3 7 15 19 2 26 9 21 2 9 13 10 17 13 4 15 5 13
## [673] 21 5 18 6 21 11 18 11 7 4 5 5 13 18 17 5 7 14 23 11 15 13 13 10
## [697] 43 47 20 14 9 26 7 12 28 1 5 18 18 12 11 20 41 23 10 20 16 21 26 12
## [721] 6 17 12 11 20 22 26 10 8 15 11 4 16 13 21 14 17 9 10 8 3 12 22 37
## [745] 18 7 11 23 13 25 12 17 29 1 12 7 1 23 4 10 25 13 1 9 9 11 26 9
## [769] 18 3 8 24 9 14 24 18 2 15 24 9 23 27 16 15 18 16 17 25 15 15 18 15
## [793] 12 2 13 20 9 13 16 15 15 9 23 13 2 20 19 29 22 2 15 9 23 7 12 3
## [817] 13 1 31 15 20 12 18 10 15 7 23 24 13 8 12 13 20 7 22 4 11 14 16 17
## [841] 10 13 11 19 17 15 25 15 19 9 20 16 14 6 14 11 13 16 7 17 16 13 5 35
## [865] 1 20 20 5 5 26 15 15 16 20 13 14 7 15 27 22 30 4 34 1 20 14 18 10
## [889] 16 8 29 4 9 8 3 10 25 18 20 9 17 28 15 20 3 18 9 21 14 7 1 12
## [913] 15 6 17 35 13 25 18 8 16 26 5 8 9 25 27 13 3 16 29 6 14 3 17 16
## [937] 17 32 8 16 3 2 16 20 24 13 19 19 11 3 14 12 8 6 1 23 7 2 4 13
## [961] 12 6 37 15 17 2 14 11 3 12 27 6 19 4 19 12 14 18 15 12 20 10 21 8
## [985] 17 17 8 20 15 10 6 21 6 8 17 24 7 13 14 27 15 23 15 11 1 24 13 26
## [1009] 16 18 5 16 28 19 12 14 14 23 17 18 31 12 11 9 20 16 18 18 2 13 4 14
## [1033] 12 13 4 3 2 8 11 3 2 2 2 13 10 1 16 12 18 5 28 16 13 15 7 22
## [1057] 17 11 10 16 14 9 8 16 15 6 18 14 15 14 3 22 38 3 5 17 9 23 14 13
## [1081] 9 24 9 9 6 16 18 17 10 24 8 27 7 22 10 12 28 14 9 7 17 26 11 23
## [1105] 9 11 12 32 32 6 13 16 8 19 37 3 18 14 10 21 13 12 15 9 6 7 9 13
## [1129] 31 13 18 4 6 18 12 3 16 11 3 29 26 14 9 8 29 18 27 11 20 19 9 16
## [1153] 13 3 9 2 12 10 7 12 25 26 18 14 17 18 9 9 12 13 14 18 30 14 6 15
## [1177] 2 19 16 14 9 10 9 5 7 11 4 14 10 4 9 12 12 3 16 16 17 24 9 11
## [1201] 16 19 11 14 3 4 5 5 17 12 16 22 11 10 8 18 21 10 4 25 3 19 14 16
## [1225] 15 16 16 11 18 13 11 12
chart.Correlation(cortable, histogram=TRUE, pch=19)
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
We have to choice 1 between differentiate - grade t stage - tumor size n
stage - regional_node_possitive - x6th_stage_num
muldata = project_2_numdata |>
select(-differentiate, -x6th_stage_num)
cox_model <- coxph(Surv(survival_months, status) ~.,
data = muldata)
vifs <- vif(cox_model)
## Warning in vif.default(cox_model): No intercept: vifs may not be sensible.
print(vifs)
## GVIF Df GVIF^(1/(2*Df))
## age 1.116360 1 1.056580
## race 1.122842 2 1.029389
## marital_status 1.299150 4 1.033255
## t_stage 4.253731 1 2.062458
## n_stage 16.186662 1 4.023265
## x6th_stage 43.299742 4 1.601624
## grade 1.132178 1 1.064038
## tumor_size 2.918845 1 1.708463
## estrogen_status 1.725718 1 1.313666
## progesterone_status 1.579248 1 1.256681
## regional_node_examined 1.772873 1 1.331493
## reginol_node_positive 4.389820 1 2.095190
## a_stage_regional 1.358703 1 1.165634
The VIF analysis reveals significant multicollinearity for n_stage (VIF = 10.94, GVIF^(1/(2Df)) =3.31), x6th_stage_num (VIF = 1.56, GVIF^(1/(2Df)) =1.25), reginol_node_positive (VIF = 4.13, GVIF^(1/(2Df)) =2.03), and t_stage (VIF = 3.83, GVIF^(1/(2Df)) =1.96), indicating redundancy. Tumor_size shows moderate multicollinearity (VIF = 2.62, GVIF^(1/(2Df)) =1.62), while other variables have acceptable VIFs near 1. Variables with high multicollinearity should be reconsidered for exclusion. also shows we need to remove. So I decided to remove t_stage, n_stage, reginol_node_positive,
final = muldata |>
select(-t_stage, -n_stage, -reginol_node_positive)
cox_model <- coxph(Surv(survival_months, status) ~.,
data = final)
vifs <- vif(cox_model)
## Warning in vif.default(cox_model): No intercept: vifs may not be sensible.
print(vifs)
## GVIF Df GVIF^(1/(2*Df))
## age 1.104634 1 1.051016
## race 1.117617 2 1.028190
## marital_status 1.252430 4 1.028535
## x6th_stage 2.194858 4 1.103255
## grade 1.124856 1 1.060592
## tumor_size 1.302777 1 1.141393
## estrogen_status 1.664705 1 1.290234
## progesterone_status 1.542471 1 1.241962
## regional_node_examined 1.260509 1 1.122724
## a_stage_regional 1.350791 1 1.162235
library(survival)
ph_test <- cox.zph(cox_model)
print(ph_test)
## chisq df p
## age 0.1735 1 0.68
## race 0.7005 2 0.70
## marital_status 4.2550 4 0.37
## x6th_stage 7.5261 4 0.11
## grade 0.0909 1 0.76
## tumor_size 0.1020 1 0.75
## estrogen_status 22.6787 1 1.9e-06
## progesterone_status 19.8507 1 8.4e-06
## regional_node_examined 0.1197 1 0.73
## a_stage_regional 0.6921 1 0.41
## GLOBAL 49.9364 17 4.3e-05
plot(ph_test)
The Cox model assumes that hazard ratios are constant over time. A non-significant p-value (p > 0.05) indicates that the PH assumption holds. As GLOBAL 50.520 14 5.0e-06, the model did not meet the assumption, as same as estrogen_status, progesterone_status, and a_stage_regional. We need further improve our model.
finalmodel <- coxph(Surv(survival_months, status) ~ age + race + marital_status +
grade + tumor_size +
regional_node_examined + x6th_stage,
data = final)
ph_test <- cox.zph(finalmodel)
print(ph_test)
## chisq df p
## age 3.20e-01 1 0.57
## race 5.65e-01 2 0.75
## marital_status 5.40e+00 4 0.25
## grade 2.01e-01 1 0.65
## tumor_size 4.63e-04 1 0.98
## regional_node_examined 2.96e-02 1 0.86
## x6th_stage 6.12e+00 4 0.19
## GLOBAL 1.31e+01 14 0.52
plot(ph_test)
cox_model$concordance
## concordant discordant tied.x tied.y tied.xy concordance
## 3.518730e+05 1.552820e+05 0.000000e+00 2.219000e+03 0.000000e+00 6.938175e-01
## std
## 1.137217e-02
finalmodel$concordance
## concordant discordant tied.x tied.y tied.xy concordance
## 3.397010e+05 1.674530e+05 1.000000e+00 2.219000e+03 0.000000e+00 6.698179e-01
## std
## 1.162855e-02
No Clear Trends: If the solid line remains flat (close to zero), it indicates that the PH assumption is satisfied for that variable. Upward/Downward Trends: A visible trend or deviation indicates that the proportional hazards assumption may be violated for the corresponding variable, suggesting time-dependent effects.
A C-index of 0.716 suggests that the model has good discriminatory power, meaning it can correctly rank the survival times for about 71.6% of the pairs.The standard error is 0.01075, indicating a narrow range of variability in the concordance estimate, suggesting robust performance. The reduction in the C-index (from 73.9% to 71.6%) indicates a trade-off between model complexity and performance.
dev_residuals <- residuals(cox_model, type = "deviance")
plot(dev_residuals, main = "Deviance Residuals", ylab = "Residuals", xlab = "Index")
abline(h = c(-2, 2), col = "red", lty = 2)
surv_fit <- survfit(Surv(survival_months, status) ~ 1, data = final)
plot(surv_fit, xlab = "Time (months)", ylab = "Survival Probability",
main = "Survival Curve for the Final Model", col = "blue", lwd = 2)
grid()